home *** CD-ROM | disk | FTP | other *** search
- #define max(i,j) ((i)>(j)?(i):(j))
- #define min(i,j) ((i)<(j)?(i):(j))
- #ifdef __STDC__
- #include <math.h> /* include this if available */
- #include <limits.h>
- #define FABSMASK INT_MAX
- #else
- #define FABSMASK 0x7fffffff
- #endif
- #ifndef fabsf
- #ifndef fabs
- double fabs();
- #endif
- #define fabsf(x) fabs((double)x)
- #endif
- float lufact(a,n,indx)
- float **a; /* a[1..n][1..n] as in Press, Flannery et al */
- int n,indx[]; /* return indx[1..n],factored a[][] */
- {
- static float *p,*p2,*p3,accest;
- float *scale=(float *)&indx[1-sizeof(float)/sizeof(int)];
- /* put first scale[] then indx[] in caller's array */
- static union intflt{float f;int i;} xmax,xx;
- /* smart compilers will use register variables */
- static int i,j,k,ipiv,newmax;
- static double d,atemp,atmp2;
- for(i=1;i<=n;++i){ /* determine scale of each row */
- xmax.f=fabsf(a[i][1]);
- for(j=2;j<=n;++j){
- #ifdef ICMP /* if comparison is to be done by int arithmetic,
- ** noting that only + numbers are possible */
- xx.f=a[i][j];
- /* consistent style so that this loop will not dominate
- ** time of the other search loop;
- ** pointer casting has been tried here,
- ** but does not agree with compiler optimizers */
- xmax.i=max(xx.i&FABSMASK,xmax.i);
- #else
- xmax.f=max(fabsf(a[i][j]),xmax.f);
- #endif
- }
- /* except for this assignment,
- ** compilers could choose concurrent outer code
- ** (assign each loop iteration to independent process) */
- scale[i]=1/xmax.f;
- }
- accest=1; /* initialize to "no loss of accuracy" */
- for(j=2;j<=n;++j){ /* main factorization loop */
- xmax.f=0.; /* search down column for max scaled pivot */
- for(i=j-1;i<=n;++i){
- #ifdef ICMP
- xx.f=a[i][j-1]*scale[i];
- /* give opportunity for pipelining by using ?
- ** instead of if(){} as preferred by certain compilers
- ** in 1988 */
- xmax.i=(newmax=(xx.i&=FABSMASK)>xmax.i)?xx.i:xmax.i;
- #else
- xmax.f=(newmax=(xx.f=fabsf(a[i][j-1]*scale[i]))>xmax.f)
- ?xx.f:xmax.f;
- #endif
- ipiv=newmax?i:ipiv;
- }
- p=a[ipiv];
- d=p[j-1]; /* this is the pivot element */
- a[ipiv]=a[j-1]; /* swap row pointers */
- a[j-1]=p;
- accest=min(accest,xmax.f); /* track cancellation error */
- scale[ipiv]=scale[j-1];
- indx[j-1] = ipiv; /* record changes for caller */
- /* column j and row j-1 calculation combined, unrolled into pairs */
- for(i=j;i<=n;i += 2){
- double sum11=0.,sum12=0.,sum21=0.,sum22=0.;
- p2=a[i];
- p3=a[i+1]; /* may point beyond a[n] */
- for(k=1;k<j-1;++k){
- /* use += in inner loop in case sumnn are not allocated to registers */
- sum11 += a[i][k]*(double)a[k][j];
- sum12 += a[i+1][k]*(double)a[k][j];
- sum21 += a[k][i]*(double)a[j-1][k];
- sum22 += a[k][i+1]*(double)a[j-1][k];
- }
- #if defined(__STDC__) && (FLT_ROUNDS>=0) /* rounded normal precision best */
- atemp=p2[j-1]/p[j-1];
- atmp2=p3[j-1]/p[j-1];
- #else /* force into double and encourage compiler to preinvert */
- atemp=p2[j-1]/d;
- atmp2=p3[j-1]/d;
- #endif
- /* reassociation of additions is OK for mixed precision, not for full double */
- p[i] -= sum21; /* completing upper factor */
- /* these results will be discarded in the case i==n */
- sum22 = p[i+1]-sum22;
- /* note i>=j ; observe sequence */
- sum12 = p3[j]-sum12-atmp2*p[j];
- /* storage assignments last to allow more parallel computation */
- p2[j] -= sum11+atemp*p[j];
- p2[j-1]=atemp;
- if(i==n)break;
- p[i+1]=sum22;
- p3[j-1]=atmp2;
- p3[j]=sum12;
- }
- }
- accest = min(fabsf(a[n][n]*scale[n]),accest);
- indx[n]=n;
- return accest;
- }